{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic Regression exercise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "%matplotlib inline\n",
    "plt.rcParams['figure.figsize'] = (15., 12.) # set default size of plots\n",
    "plt.rcParams['image.interpolation'] = 'nearest'\n",
    "plt.rcParams['image.cmap'] = 'gray'\n",
    "\n",
    "# Some more magic so that the notebook will reload external python modules;\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the data for you\n",
    "data = pd.read_csv('./DSVC/datasets/MNIST.csv',header=0).values # change the path by yourself\n",
    "imgs = data[0::,1::]\n",
    "labels = data[::,0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize some examples from the dataset.\n",
    "# We show a few examples of training images from each class.\n",
    "classes = range(10)\n",
    "num_classes = len(classes)\n",
    "samples_per_class = 7\n",
    "for y, cls in enumerate(classes):\n",
    "    idxs = np.flatnonzero(labels == y)\n",
    "    idxs = np.random.choice(idxs, samples_per_class, replace=False)\n",
    "    for i, idx in enumerate(idxs):\n",
    "        plt_idx = i * num_classes + y + 1\n",
    "        plt.subplot(samples_per_class, num_classes, plt_idx)\n",
    "        plt.imshow(imgs[idx].reshape(28,28).astype('uint8'))\n",
    "        plt.axis('off')\n",
    "        if i == 0:\n",
    "            plt.title(cls)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Binary classification\n",
     "We use Logistic Regression to classify handwritten digits: whether a digit is zero or not. If the handwritten digit is '0', then the label is 0; otherwise, the label is 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# transform the labels to binary\n",
    "for i in xrange(len(labels)):\n",
    "    if labels[i] != 0:\n",
    "        labels[i] = 1\n",
    "        \n",
    "# 2/3 training set\n",
    "# 1/3 test set\n",
     "split_index = len(labels) * 2 // 3\n",
    "X_train = imgs[:split_index]\n",
    "y_train = labels[:split_index]\n",
    "X_test = imgs[split_index:]\n",
    "y_test = labels[split_index:]\n",
    "\n",
    "X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])\n",
    "X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])\n",
    "\n",
    "print X_train.shape\n",
    "print X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train_feats = None # choose and extract features\n",
    "X_test_feats = None # choose and extract features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize some examples from the dataset.\n",
    "# We show a few examples of training images from each class.\n",
    "classes = range(2)\n",
    "num_classes = len(classes)\n",
    "samples_per_class = 7\n",
    "for y, cls in enumerate(classes):\n",
    "    idxs = np.flatnonzero(labels == y)\n",
    "    idxs = np.random.choice(idxs, samples_per_class, replace=False)\n",
    "    for i, idx in enumerate(idxs):\n",
    "        plt_idx = i * num_classes + y + 1\n",
    "        plt.subplot(samples_per_class, num_classes, plt_idx)\n",
    "        plt.imshow(imgs[idx].reshape(28,28).astype('uint8'))\n",
    "        plt.axis('off')\n",
    "        if i == 0:\n",
    "            plt.title(cls)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from DSVC.classifiers import LogisticRegression\n",
    "\n",
    "# Start training. \n",
    "# You should open DSVC/classifiers/logistic_regression.py and implement the function.\n",
    "# Then run this cell.\n",
    "\n",
    "classifier = LogisticRegression()\n",
    "loss_history = classifier.train(\n",
    "    X_train_feats, \n",
    "    y_train,\n",
    "    learning_rate = 1e-3,\n",
    "    num_iters = 500,\n",
    "    batch_size = 64,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(loss_history)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_test_pred = classifier.predict(X_test_feats)\n",
     "print \"The accuracy score is \", np.mean(y_test == y_test_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
     "You should get an accuracy higher than 96%.\n",
    "\n",
    "----\n",
    "\n",
    "### F1-Measure\n",
     "Notice that if our model always outputs '1', totally ignoring the input X, it can still get an accuracy of 90%. So, in this assignment, accuracy alone is not an informative enough metric.\n",
    "\n",
    "We will use F1-Measure to evaluate our model.\n",
    "\n",
    "You may need this:\n",
    "[F1-Measure](https://baike.baidu.com/item/f-measure/913107?fr=aladdin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Calculate the precision(准确率), recall(召回率) and F1\n",
    "# important： We should consider label '0' as 'positive' here. \n",
    "# That means 'True positive' ==> '(y_test == 0) and (y_test_pred == 0)'\n",
    "\n",
    "#######Your code here########\n",
    "\n",
    "print precision\n",
    "print recall\n",
    "print 'F1:', precision*recall*2/(precision+recall)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
     "You should get an F1 score higher than 85%."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Multiclass classification\n",
    "\n",
     "Now, we use Logistic Regression to classify handwritten digits. There are 10 classes, from '0' to '9'.\n",
    "\n",
    "\n",
     "Hint: The method \"one vs all\" may be helpful. [Here is an introduction to \"one vs all\"](https://msdn.microsoft.com/library/en-us/Dn905887.aspx). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the data for you\n",
    "data = pd.read_csv('./DSVC/datasets/MNIST.csv',header=0).values # change the path by yourself\n",
    "imgs = data[0::,1::]\n",
    "labels = data[::,0]\n",
    "        \n",
    "# 2/3 training set\n",
    "# 1/3 test set\n",
     "split_index = len(labels) * 2 // 3\n",
    "X_train = imgs[:split_index]\n",
    "y_train = labels[:split_index]\n",
    "X_test = imgs[split_index:]\n",
    "y_test = labels[split_index:]\n",
    "\n",
    "X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])\n",
    "X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])\n",
    "\n",
    "print X_train.shape\n",
    "print X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train_feats = None # choose and extract features\n",
    "X_test_feats = None # choose and extract features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Start training. \n",
    "# You should update your code in DSVC/classifiers/logistic_regression.py .\n",
    "# Then run this cell.\n",
    "\n",
    "classifier = LogisticRegression()\n",
    "classifier.one_vs_all(\n",
    "    X_train_feats, \n",
    "    y_train,\n",
    "    learning_rate = 1e-3,\n",
    "    num_iters = 500,\n",
    "    batch_size = 64,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# you may change your code in function `predict`\n",
    "y_test_pred = classifier.predict(X_test_feats)\n",
     "print \"The accuracy score is \", np.mean(y_test == y_test_pred)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
